In [1]:
import pandas as pd 
from gensim import corpora, models, similarities

In [2]:
review = pd.read_pickle('../output/bar_reviews_cleaned_and_tokenized.pickle')

Merge a bar's reviews into a single document


In [23]:
from itertools import chain
from collections import OrderedDict
reviews_merged = OrderedDict()

# Flatten the reviews, so each review is just a single list of words.


n_reviews = -1

for bus_id in set(review.business_id.values[:n_reviews]):
    # This horrible line first collapses each review of a corresponding business into a list
    # of lists, and then collapses the list of sentences to a long list of words
    reviews_merged[bus_id] = list(chain.from_iterable( 
                                    chain.from_iterable( review.cleaned_tokenized[review.business_id==bus_id] )))

Now we must generate a dictionary which maps vocabulary into a number


In [24]:
import time 
from itertools import chain

print 'Generating vector dictionary....'
 # Review level LDA
# review_flatten = list(chain.from_iterable(review.cleaned_tokenized.iloc[:])) 
# id2word_wiki = corpora.Dictionary(review_flatten)


start = time.time()

# Business level LDA (all reviews for a business merged)
id2word_wiki = corpora.Dictionary(reviews_merged.values())

print 'Dictonary generated in %1.2f seconds'%(time.time()-start)


Generating vector dictionary....

KeyboardInterruptTraceback (most recent call last)
<ipython-input-24-8de2f3cce048> in <module>()
     11 
     12 # Business level LDA (all reviews for a business merged)
---> 13 id2word_wiki = corpora.Dictionary(reviews_merged.values())
     14 
     15 print 'Dictonary generated in %1.2f seconds'%(time.time()-start)

/home/carlson/anaconda/envs/insight/lib/python2.7/site-packages/gensim/corpora/dictionary.pyc in __init__(self, documents, prune_at)
     56 
     57         if documents is not None:
---> 58             self.add_documents(documents, prune_at=prune_at)
     59 
     60     def __getitem__(self, tokenid):

/home/carlson/anaconda/envs/insight/lib/python2.7/site-packages/gensim/corpora/dictionary.pyc in add_documents(self, documents, prune_at)
    117 
    118             # update Dictionary with the document
--> 119             self.doc2bow(document, allow_update=True)  # ignore the result, here we only care about updating token ids
    120 
    121         logger.info(

/home/carlson/anaconda/envs/insight/lib/python2.7/site-packages/gensim/corpora/dictionary.pyc in doc2bow(self, document, allow_update, return_missing)
    144         counter = defaultdict(int)
    145         for w in document:
--> 146             counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
    147 
    148         token2id = self.token2id

KeyboardInterrupt: 

In [ ]:
# Convert corpus to bag of words for use with gensim...
# See https://radimrehurek.com/gensim/tut1.html#from-strings-to-vectors
#corpus = map(lambda doc: id2word_wiki.doc2bow(doc), review_flatten)
corpus = map(lambda doc: id2word_wiki.doc2bow(doc), reviews_merged.values())
corpora.MmCorpus.serialize('../output/bar_corpus.mm', corpus)


# Can load the corpus with 
# from gensim import corpora
# corpus = corpora.MmCorpus('../output/bar_corpus.mm')

In [ ]:
import gensim
print 'Fitting LDA Model'
start = time.time()
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, 
                                           id2word=id2word_wiki, passes=5,)
print 'LDA fit in %1.2f seconds'%(time.time()-start)

In [ ]:
for topic in ldamodel.print_topics(num_topics=10, num_words=8): 
    print topic

In [50]:
from sklearn.decomposition import LatentDirichletAllocation, nmf


lda = LatentDirichletAllocation(n_topics=10, evaluate_every=1000, n_jobs=12, verbose=True)

lda.fit(corpus[:2000])


/home/carlson/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)

ValueErrorTraceback (most recent call last)
<ipython-input-50-de4b63b55b40> in <module>()
      4 lda = LatentDirichletAllocation(n_topics=10, evaluate_every=1000, n_jobs=12, verbose=True)
      5 
----> 6 lda.fit(corpus[:2000])

/home/carlson/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/decomposition/online_lda.pyc in fit(self, X, y)
    492         """
    493         self._check_params()
--> 494         X = self._check_non_neg_array(X, "LatentDirichletAllocation.fit")
    495         n_samples, n_features = X.shape
    496         max_iter = self.max_iter

/home/carlson/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/decomposition/online_lda.pyc in _check_non_neg_array(self, X, whom)
    434 
    435         """
--> 436         X = check_array(X, accept_sparse='csr')
    437         check_non_negative(X, whom)
    438         return X

/home/carlson/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    391         # make sure we acually converted to numeric:
    392         if dtype_numeric and array.dtype.kind == "O":
--> 393             array = array.astype(np.float64)
    394         if not allow_nd and array.ndim >= 3:
    395             raise ValueError("Found array with dim %d. %s expected <= 2."

ValueError: setting an array element with a sequence.

In [ ]: